Purpose

The purpose of this document is to show my approach to understanding precipitation in Santa Barbara county by accessing and analyzing precipitation data sets and then applying that analysis to understand its effect on the Cachuma Reservoir.

Precipitation

Gathering Data

Rainfall data was gathered from the Santa Barbara County Website.

This data consisted of 81 separate .xls files, each from a different rainfall gauge in Santa Barbara County, which I put into a single folder.

I then created a function to read and clean these files. A for loop was used to apply this function to all 81 files and put all of the data into a single data frame

A separate function and for loop was used to extract the location data for each rainfall gauge.

After this process I had two data frames, one with daily rainfall totals for each date based on station id and another with the latitude and longitude coordinates of each station (rainfall gauge).

# Precipitation data
precip_files <- list.files(path = here("precip"))  
  
read_precip <- function(file) {
  df <- read_excel(here("precip", file), sheet = 1, skip = 9)
  colnames(df) <- c("station_id", "water_year", "year", "month", "day", "daily_rain", "code")
  df <- df %>%   unite(date, c(year, month, day), sep = "-") %>% 
  mutate(date = ymd(date)) %>% 
  clean_names() %>% 
  select(station_id, date, daily_rain)
  return(df)
}

combined <- data.frame()

for (i in precip_files) {
  data <- read_precip(i)
  combined <- rbind(combined, data)
}

# Precipitation locations
meta <- function(file) {
  read_excel(here("precip", file), sheet = 1, skip = 3, n_max = 2) %>% 
  pivot_longer(cols = everything(), names_to = "id", values_to = "coord")
}

combined_meta <- data.frame()

for (i in precip_files) {
  data <- meta(i)
  combined_meta <- rbind(combined_meta, data)
}

combined_meta2 <- combined_meta %>% 
  separate(col = id, into = c("a", "b", "c"), sep = " ") %>% 
  mutate(a = str_replace(a, "#", "")) %>% 
  separate(col = coord, into = c("lat", "lon", "elev"), sep = ",") %>% 
  mutate(lat = str_replace(lat, "Lat ", ""),
         lon = str_replace(lon, "Long ", ""), 
         elev = str_replace(elev, "Elev ", ""),
         elev = str_replace(elev, "ft", "")) %>% 
  rename(id = a) %>% 
  select(id, lat, lon, elev) %>% 
  mutate(id = str_replace(id, " ", ""), lat = str_replace(lat, " ", ""), lon = str_replace(lon, " ", ""), elev = str_replace(elev, " ", "")) %>% 
  separate(lat, into = c("latA", "latB", "latC"), sep = "-") %>% 
  separate(lon, into = c("lonA", "lonB", "lonC"), sep = "-") %>% 
  mutate(latA = as.numeric(latA), latB = as.numeric(latB), latC = as.numeric(latC), lonA = as.numeric(lonA), lonB = as.numeric(lonB), lonC = as.numeric(lonC)) %>% 
  mutate(lat_deg_dec = latA + (latB/60) + (latC/3600), lon_deg_dec = lonA + (lonB/60) + (lonC/3600)) %>% 
  rename(lat = lat_deg_dec, lon = lon_deg_dec) %>% 
  select(id, lat, lon, elev) %>% 
  rename(station_id = id) %>% 
  mutate(station_id = as.numeric(station_id)) %>% 
  mutate(elev = as.numeric(elev)) %>% 
  mutate(lon = lon*-1)

# finalized data frames
precip <- combined %>% 
  drop_na()

meta <- combined_meta2
precip_withcoord <- left_join(precip, meta, by = "station_id")

Wrangling the Data

The next step was to wrangle this data to produce multiple different data frames which can be used for analysis.

# daily precip averaged across the entire county
precip_avg <- precip %>% 
  group_by(date) %>% 
  summarize(avg_rain = mean(daily_rain)) %>% 
  mutate(year = year(date))

# precip yearly totals based on station id
precip_year_all <- precip %>% 
  mutate(year = year(date)) %>% 
  group_by(station_id, year) %>% 
  summarize(year_precip = sum(daily_rain))

# precip yearly totals averaged over entire county
precip_year_avg <- precip_year_all %>% 
  ungroup() %>% 
  group_by(year) %>% 
  summarize(avg_precip = mean(year_precip))

# precip monthly totals based on station id
precip_month_all <- precip %>% 
  mutate(month = month(date), year = year(date)) %>% 
  unite(monthyear, c(month, year), sep = "-") %>% 
  mutate(monthyear = my(monthyear)) %>% 
  group_by(station_id, monthyear) %>% 
  summarize(month_precip = sum(daily_rain)) %>% 
  mutate(year = year(monthyear))

# precip monthly totals averaged over entire county
precip_month_avg <- precip_month_all %>% 
  ungroup() %>% 
  group_by(monthyear) %>% 
  summarize(avg_precip = mean(month_precip)) %>% 
  mutate(year = year(monthyear))

Plotting the points

I first used ggmap::get_stamenmap() to get a map of Santa Barbara County

I then created another data frame with yearly rainfall totals averaged over all the years in the data set based on station id. This allowed me to plot these points onto the map

### get a basemap
sb_map <- get_stamenmap(bbox = c(left = -120.65,
                                    bottom = 34.39694,
                                    right = -119.4622,
                                    top = 35.07694),
          maptype = "terrain", 
          crop = TRUE,
          zoom = 10)

# get yearly precipitation average for all years for each station
precip_year_plot <- precip_year_all %>% 
  ungroup() %>% 
  group_by(year, station_id) %>% 
  summarize(year_precip) %>% 
  left_join(meta, by = "station_id") %>% 
  ungroup() %>% 
  group_by(station_id, lat, lon) %>% 
  summarize(year_precip = mean(year_precip))

#create a plot of points on a map of Santa Barbara County
ggmap(sb_map, darken = .2) +
  geom_point(data = precip_year_plot, mapping = aes(x = lon, y = lat, color = year_precip), size = 2) +
  scale_color_gradient(low = "lightblue", high = "blue4") +
  labs(title = "Yearly Precipitation in Santa Barbara County", color = "Yearly Precipitation (in)") +
  theme(legend.position = "bottom")

Kriging

The next step was to spatially interpolate the data to get a better idea of the precipitation in the entire county.

I decided to use the form of spatial interpolation known as Ordinary Kriging.

The first step in this process was to create a variogram, which describes the spatial dependence. This was done using the gstat::variogram(). The function automap::autofitVariogram() was used to choose the model that best fits the data. Next, after defining a target grid, the gstat::krige() function was used to generate the set of predictions.

precip_sf <- st_as_sf(precip_year_plot, coords = c("lon", "lat"), crs = 4326) %>% 
  cbind(st_coordinates(.))

v_emp_OK <- gstat::variogram(
  year_precip~1,
  as(precip_sf, "Spatial"))

v_mod_full <- automap::autofitVariogram(year_precip~1, as(precip_sf, "Spatial"))

v_mod <- v_mod_full$var_model

grd_sf <- precip_sf %>% 
  st_bbox() %>% 
  st_as_sfc() %>% 
  st_make_grid( 
  cellsize = c(.001, .001), 
  what = "centers"
  ) %>%
  st_as_sf() %>% 
  cbind(., st_coordinates(.))

grd_sp <- as(grd_sf, "Spatial") 
gridded(grd_sp) <- TRUE             
grd_sp <- as(grd_sp, "SpatialPixels") 

OK <- krige(
  year_precip~1,                      
  as(precip_sf, "Spatial"), 
  grd_sp,                
  model = v_mod         
  )
## [using ordinary kriging]
df <- rasterToPoints(raster(OK)) %>% 
  as_tibble()
colnames(df) <- c("X", "Y", "Z")

OK_plot <- ggmap(sb_map, darken = .2) +
  geom_raster(data = df, aes(x = X, y = Y, fill = Z)) +
  coord_cartesian() +
  scale_fill_gradient2(low = "transparent", mid = "blue4", high = "blue4", midpoint = 30) +
  labs(title = "Yearly Precipitation in Santa Barbara County", fill = "Yearly Precipitation (in)") +
  theme(legend.position = "bottom")

OK_plot

To get a better understanding of how precipitation has been changing over the years, looped this process over a subset of years

krig_precip <- function(desired_year) {
  precip_year_plot <- precip_year_all %>% 
  ungroup() %>% 
  group_by(year, station_id) %>% 
  summarize(year_precip) %>% 
  left_join(meta, by = "station_id") %>% 
  filter(year == desired_year)
  
  precip_sf <- st_as_sf(precip_year_plot, coords = c("lon", "lat"), crs = 4326) %>% 
  cbind(st_coordinates(.))

v_emp_OK <- gstat::variogram(
  year_precip~1,
  as(precip_sf, "Spatial"))

v_mod_full <- automap::autofitVariogram(year_precip~1, as(precip_sf, "Spatial"))

v_mod <- v_mod_full$var_model

grd_sf <- precip_sf %>% 
  st_bbox() %>% 
  st_as_sfc() %>% 
  st_make_grid( 
  cellsize = c(.001, .001), 
  what = "centers"
  ) %>%
  st_as_sf() %>% 
  cbind(., st_coordinates(.))

grd_sp <- as(grd_sf, "Spatial") 
gridded(grd_sp) <- TRUE             
grd_sp <- as(grd_sp, "SpatialPixels") 

OK <- krige(
  year_precip~1,                      
  as(precip_sf, "Spatial"), 
  grd_sp,                
  model = v_mod         
  )

df <- rasterToPoints(raster(OK)) %>% 
  as_tibble()
colnames(df) <- c("X", "Y", "Z")

OK_plot <- ggmap(sb_map, darken = .2) +
  geom_raster(data = df, aes(x = X, y = Y, fill = Z)) +
  coord_cartesian() +
  scale_fill_gradient(low = "transparent", high = "blue4", limits = c(0,55)) +
  labs(title = paste(desired_year), fill = "in_rain") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank(), legend.position = "bottom")

return(OK_plot)
}

for (i in 2015:2021) {
  plot <- krig_precip(i)
  assign(paste0("plot_", i), plot)
}
## [using ordinary kriging]
## [using ordinary kriging]
## [using ordinary kriging]
## [using ordinary kriging]
## [using ordinary kriging]
## [using ordinary kriging]
## [using ordinary kriging]
plot_county <- ggplot(filter(precip_year_avg, year >= 2015), aes(x = year, y = avg_precip)) +
  geom_col(color = "blue4", fill = "blue4") +
  labs(title = "Countywide Average Rain (in)", x = "Year") +
  theme(axis.text.y = element_blank(), plot.title = element_text(size = 8))
  


grid.arrange(plot_2015, plot_2016, plot_2017, plot_2018, plot_2019, plot_2020, plot_2021, plot_county, nrow = 2)

Cachuma Reservoir

With the precipitation data sorted out I can now apply this to understanding to the effect precipitation on the water level of the Cachuma Reservoir. The Cachuma Reservoir is heavily relied on by the city of Santa Barbara, which is entitled to 32.19% of its available water. Understanding how rainfall affects this reservoir is vitally important.

Reservoir data

I found reservoir level data for the Cachuma Reservoir on the County of Santa Barbara Public Works website.

They provide fairly consistent data going back to the year 2015. reservoir level are measured at 15 minute intervals in the units of ft.

## Cachuma data read in

url1 <- "https://rain.cosbpw.net/export/file/?site_id=105&site=70729dd9-97d4-430a-9271-7b6c195b49be&device_id=1&device=5d7a3129-708d-4881-9886-f84c6686ab41&mode=&hours=&data_start="

url2 <- "-01-01%2000:00:00&data_end="

url3 <- "-12-31%2023:59:59&tz=US%2FPacific&format_datetime=%25Y-%25m-%25d+%25H%3A%25i%3A%25S&mime=txt&delimiter=tab"

read_cachuma <- function(year) {
  df <- read_delim(paste0(url1, year, url2, year, url3), delim = "\t")
  df <- df %>% mutate(Reading = ymd_hms(Reading)) %>% 
  mutate(date = lubridate::as_date(Reading)) %>% 
  group_by(date) %>% 
  summarize(level = mean(Value)) %>% 
  mutate(year = year(date))
  return(df)
}

years_cachuma <- c("2021", "2020", "2019", "2018", "2017", "2016", "2015", "2013", "2012")

cachuma <- data.frame()

for (i in years_cachuma) {
  data <- read_cachuma(i)
  cachuma <- rbind(cachuma, data)
}

cachuma <- cachuma %>% 
  arrange(date)

cachuma$level_change = NA

for (i in 2:length(cachuma$date)) {
  if ((cachuma$date[i] - cachuma$date[i-1]) == 1) {
    cachuma$level_change[i] <- cachuma$level[i] - cachuma$level[i-1]
  }
}

### cachuma daily
cachuma_precip <- cachuma %>% 
  full_join(precip_avg) %>% 
  filter(year >= 2015) %>% 
  arrange(date)

cachuma_precip_all <- cachuma %>% 
  full_join(precip) %>% 
  filter(year >= 2015) %>% 
  arrange(date)

### cachuma monthly
cachuma_precip_month <- cachuma %>% 
  mutate(month = month(date)) %>% 
  unite(monthyear, c(month, year), sep = "-") %>% 
  mutate(monthyear = my(monthyear)) %>% 
  group_by(monthyear) %>% 
  summarize(monthly_level_change = sum(level_change)) %>% 
  full_join(precip_month_avg) %>% 
  filter(year >= 2015) %>% 
  arrange(monthyear)

### cachuma monthly all
cachuma_precip_month_all <- cachuma %>% 
  mutate(month = month(date)) %>% 
  unite(monthyear, c(month, year), sep = "-") %>% 
  mutate(monthyear = my(monthyear)) %>% 
  group_by(monthyear) %>% 
  summarize(monthly_level_change = sum(level_change)) %>% 
  full_join(precip_month_all) %>% 
  filter(year >= 2015) %>% 
  arrange(monthyear) 

With this data, I was able to make some quick plots

cachuma_quick_1 <- ggplot(cachuma_precip, aes(x = date)) +
  geom_line(aes(y = level)) +
  scale_y_continuous(limits = c(600, 800)) +
  labs(title = "Reservoir Level (ft)", x = "Date") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), plot.title = element_text(size = 9))

cachuma_quick_2 <- ggplot(cachuma_precip, aes(x = date)) +
  geom_line(aes(y = level_change)) +
  labs(title = "Change in Reservoir Level (ft)", x = "Date")+
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), plot.title = element_text(size = 9))


cachuma_quick_3 <- ggplot(cachuma_precip_month, aes(x = monthyear, y = monthly_level_change)) +
  geom_col() +
  labs(title = "Monthly Change in Reservoir Level (ft)", x = "Date")+
  theme(axis.title.x = element_blank(), axis.title.y = element_blank(), plot.title = element_text(size = 9))


cachuma_quick_1 + cachuma_quick_2 + cachuma_quick_3

Statistical Analysis

More Kriging

After matching up the reservoir data with the precipitation data, I wanted to see which stations recorded rainfall levels that correlated most with changes in reservoir water level. To do this, I created a function that assigns an r squared value to a station based on the results of a linear regression analysis of the effect of monthly total rainfall at the station and the monthly change in reservoir level. I looped this function over all of the 81 stations and put the results into a new data frame. This linear regression was done with the lm() function.

station <- function(station) {
  station_eval <- cachuma_precip_month_all %>% 
    filter(station_id == station)
  model <- lm(monthly_level_change~month_precip, data = station_eval)
  sum_model <- summary(model)
  r_sq <- sum_model$r.squared
  return(r_sq)
}

stations <- unique(cachuma_precip_month_all$station_id)

station_df <- data.frame(station_id = stations, r_sq = NA) %>% 
  arrange(station_id)

for (i in 1: length(station_df$station_id)) {
  station_df$r_sq[i] <- station(station_df$station_id[i])
}

station_df <- station_df %>% 
  left_join(meta)

station_df <- select(station_df, -elev)

With these r squared values, I thought a good way to visualize this would be with another spatial interpolation, this time measuring the correlation between rainfall and reservoir level.

correlation_sf <- st_as_sf(station_df, coords = c("lon", "lat"), crs = 4326) %>% 
  cbind(st_coordinates(.))

v_emp_OK <- gstat::variogram(
  r_sq~1,
  as(correlation_sf, "Spatial"))

v_mod_full <- automap::autofitVariogram(r_sq~1, as(correlation_sf, "Spatial"))

v_mod <- v_mod_full$var_model

grd_sf <- correlation_sf %>% 
  st_bbox() %>% 
  st_as_sfc() %>% 
  st_make_grid( 
  cellsize = c(.001, .001), 
  what = "centers"
  ) %>%
  st_as_sf() %>% 
  cbind(., st_coordinates(.)) 

grd_sp <- as(grd_sf, "Spatial") 
gridded(grd_sp) <- TRUE             
grd_sp <- as(grd_sp, "SpatialPixels")

OK <- krige(
  r_sq~1,                      
  as(correlation_sf, "Spatial"), 
  grd_sp,                
  model = v_mod         
  )
## [using ordinary kriging]
df <- rasterToPoints(raster(OK)) %>% 
  as_tibble()
colnames(df) <- c("X", "Y", "Z")

OK_plot <- ggmap(sb_map, darken = .2) +
  geom_raster(data = df, aes(x = X, y = Y, fill = Z)) +
  coord_cartesian() +
  scale_fill_gradient2(low = "transparent", mid = "transparent", high = "red", midpoint = 0.45) +
  theme(legend.position = "bottom")+
  geom_text(data = filter(station_df, r_sq == max(r_sq)), inherit.aes = FALSE, aes(x = lon, y = lat, label = station_id)) +
  labs(title = "Correlation with Cachuma Reservoir Level", fill = "R Squared Value") +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

OK_plot

From this plot, it was clear to see which area had the most correlation.
I was also able to pinpoint which station was most highly correlated: station 238.
Plotting it in this way was a good way to check and make sure that this correlation makes sense. Station 238 does appear to be in an area that would drain into the Cachuma Reservoir.

I chose to use station 238 for the remainder of my analysis.

station_238_monthly <- cachuma_precip_month_all %>% 
  filter(station_id == 238)

station_238_daily <- cachuma_precip_all %>% 
  filter(station_id == 238)

Here is the summary of the linear regression model for station 238.

model <- lm(monthly_level_change~ month_precip, data = station_238_monthly)
model_summary <- summary(model)
model_summary
## 
## Call:
## lm(formula = monthly_level_change ~ month_precip, data = station_238_monthly)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4507  -3.4098   0.6043   2.5695  23.1083 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -3.9692     0.9938  -3.994 0.000232 ***
## month_precip   2.8971     0.3206   9.037 9.19e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.249 on 46 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.6397, Adjusted R-squared:  0.6319 
## F-statistic: 81.67 on 1 and 46 DF,  p-value: 9.191e-12

I then put together a graph of the data with the linear model, as well as graphs of the residuals.

model_plot <- ggplot(station_238_monthly, aes(x = month_precip, y = monthly_level_change)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Monthly Precipitation", y = "Monthly Change in Reservoir Level") +
  theme(axis.title.y = element_text(size = 8), axis.title.x = element_text(size = 10))

predictions <- station_238_monthly %>% 
  add_predictions(model) %>% 
  mutate(residuals = monthly_level_change - pred)

res_hist <- ggplot(predictions) +
  geom_histogram(aes(residuals))

res_point <- ggplot(predictions) +
  geom_point(aes(x = month_precip, y = residuals))

model_plot / (res_hist + res_point)

Plots

Revisiting the plots I made of the Cachuma Reservoir water levels earlier, I could now confidently add the precipitation data from station 238.

final_daily_plot <- ggplot(station_238_daily, aes(x = date)) +
  geom_rect(aes(ymin=600, ymax = 800, xmin = date-7, xmax=date+7, fill = daily_rain)) +
  scale_fill_gradient(low = "transparent", high = "royalblue1", na.value = "transparent") +
  geom_line(aes(y = level)) +
  scale_y_continuous(limits = c(600, 800)) +
  labs(y = "Reservoir Level (ft)", fill = "Daily Rain (in)") +
  theme(axis.title.x = element_blank())

final_daily_change_plot <- ggplot(station_238_daily, aes(x = date)) +
  geom_rect(aes(ymin=-5, ymax = 20, xmin = date-7, xmax=date+7, fill = daily_rain)) +
  scale_fill_gradient(low = "transparent", high = "royalblue1", na.value = "transparent") +
  geom_line(aes(y = level_change)) +
  labs(y = "Level Change (ft)", fill = "Daily Rain") +
  theme(axis.title.x = element_blank())
  


final_monthly_plot <- ggplot(station_238_monthly) +
  geom_rect(aes(ymin=-10, ymax = 50, xmin = monthyear, xmax=monthyear+30, fill = month_precip), alpha = 0.5) +
  scale_fill_gradient2(low = "transparent", mid = "royalblue1", high = "darkblue", midpoint = 5, na.value = "transparent") +
  geom_col(aes(y = monthly_level_change, x = monthyear + 15)) +
  labs(y = "Level Change (ft)", fill = "Monthly Rain") +
  theme(axis.title.x = element_blank())


final_daily_plot / (final_daily_change_plot + final_monthly_plot)